# dependencies
library(tidyverse)    # dplyr/tidyr/stringr/forcats/readr/ggplot2 used throughout
library(effectsize)   # r_to_d() conversions for the power analyses
library(pwr)          # pwr.r.test(), pwr.t.test()
library(irr)          # kappa2() inter-rater reliability
library(metafor)      # escalc(), rma(), rma.mv(), selmodel(), funnel()
library(forestploter) # forest() plots and forest_theme()
library(janitor)      # round_half_up()
library(greekLetters)
library(knitr)        # kable()
library(kableExtra)   # kable_classic()
# Output directory for saved plots; suppress the "already exists" warning so
# re-running the script stays clean.
dir.create("plots", showWarnings = FALSE)
# function to round all numeric vars in a data frame
#
# Applies janitor::round_half_up() (round half away from zero) to every
# numeric column, leaving non-numeric columns untouched.
#
# df: a data frame or tibble.
# n_digits: decimal places to keep (default 3).
# Returns: df with all numeric columns rounded.
round_df <- function(df, n_digits = 3) {
  # across(where(...)) is the current dplyr idiom; mutate_if() is superseded.
  df %>% mutate(across(where(is.numeric), ~ janitor::round_half_up(.x, digits = n_digits)))
}
# apa format p value -----
#
# Formats p values APA style: values >= .001 are shown to three decimal
# places with no leading zero (e.g. "= .050"); smaller values are "< .001".
# NA inputs return NA. Vectorized over p.
apa_p_value <- function(p){
  # sprintf() keeps trailing zeros (".050" rather than ".05"), which a bare
  # paste(round_half_up()) would drop.
  p_formatted <- ifelse(is.na(p), NA_character_,
                        ifelse(p < 0.001, "< .001",
                               paste("=", sprintf("%.3f", janitor::round_half_up(p, 3)))))
  # APA style omits the leading zero for statistics bounded by 1
  p_formatted <- gsub(pattern = "0.", replacement = ".", x = p_formatted, fixed = TRUE)
  p_formatted
}
# meta results string
#
# Builds a one-line summary of a meta-analytic result.
# fit: fitted metafor model (uses $k and $zval).
# predictions: data frame with pred, ci.lb/ub, cr.lb/ub, pi.lb/ub columns.
# Returns a single formatted character string.
meta_effect_string <- function(fit, predictions){
  # exact two-sided p value recovered from the model's z statistic
  exact_p <- signif(2 * pnorm(-abs(fit$zval)), digits = 1)
  intervals <- paste0(", 95% CI [", predictions$ci.lb, ", ", predictions$ci.ub, "]",
                      ", 95% CR [", predictions$cr.lb, ", ", predictions$cr.ub, "]",
                      ", 95% PI [", predictions$pi.lb, ", ", predictions$pi.ub, "]")
  paste0("k = ", fit$k, ", r = ", predictions$pred, intervals, ", p = ", exact_p)
}
# notation off
# disable scientific notation so very small p values print in full decimal form
options(scipen = 999)
# plot theme
# shared forestploter theme applied to every forest plot below
custom_theme <-
forest_theme(base_size = 10,
ci_lty = 1,
ci_lwd = 1.5,
ci_Theight = 0.2,
summary_fill = "black",
summary_col = "black",
footnote_cex = 0.8)
# get data
## Vahey et al.'s effect sizes
# values transcribed from Vahey et al. (2015): forest-plot effect sizes, Ns,
# and supplementary-material effect sizes
data_vahey <- read.csv("../data/data_vahey_et_al_2015.csv")
## get effect sizes reextracted from the original articles
data_reextracted <- read.csv("../data/data_reextracted_and_annotated.csv") %>%
  # remove effects that merely repeat what vahey found, which are replaced by
  # other rows that do new extractions for a corresponding effect.
  filter(replaced_effect == FALSE) %>%
  # mark each effect as clinically relevant if either rater scored it as such.
  # pmax() is the vectorized equivalent of the previous rowwise() max() and
  # avoids the per-row overhead.
  # NOTE(review): an earlier comment said "both raters", which would require
  # pmin(), but max() implements "either rater" — confirm the intended rule.
  mutate(clinically_relevant =
           as.logical(pmax(clinically_relevant_criterion_rater_1,
                           clinically_relevant_criterion_rater_2)))
### remove non-criterion effects
# keep only effects that were flagged as criterion effects during extraction
data_reextracted_criterion <- data_reextracted %>%
filter(criterion_effect == TRUE)
### remove non-clinical criterion variables
# keep only effects rated clinically relevant (see the mutate() above)
data_reextracted_criterion_clinically_relevant <- data_reextracted_criterion %>%
filter(clinically_relevant == TRUE)
## sample sizes in published IRAP research, taken from a separate project
data_sample_sizes <- read_csv("../data/sample size data/data_irap.csv")
# vector of post-exclusion Ns, used at the end of the script to benchmark
# power-analysis requirements against the published literature
data_sample_sizes_vector <- data_sample_sizes |>
drop_na(n_participants_after_exclusions) |>
pull(n_participants_after_exclusions)

Calculated from the meta-analytic effect sizes reported in the forest plot and text.
Vahey et al.’s reported meta effect size estimate was r = .45, 95% CI [.40, .54], 95% CR [.23, .67]. Note that this is quite a large effect size, equivalent to Cohen’s d = 1.01.
Using this effect size, they conducted power analyses.
I used the R packages pwr and effectsize to attempt to reproduce these values.
# Vahey et al.'s reported effect sizes and required sample sizes, organized
# by test type, tails, and which estimate (point vs lower CI bound) was used.
# required_n_verified is filled in by the reproduction code below.
data_power_verified <-
  tibble(
    test = rep(c("Pearson’s r",
                 "Independent t-test (Cohen's d)",
                 "Dependent t-test (Cohen's d)"),
               times = c(4, 2, 2)),
    tails = rep(c("One-tailed", "Two-tailed", "One-tailed"),
                times = c(2, 2, 4)),
    estimate = rep(c("Point estimate",
                     "Lower bound of 95% Confidence Interval"),
                   times = 4),
    effect_size_vahey = c(0.45, 0.4, 0.45, 0.4, 1.01, 0.87, 1.01, 0.87),
    required_n_vahey = c(29L, 37L, 36L, NA, 26L, 36L, 8L, 10L)
  )
# Vahey et al.'s meta-analytic point estimate and lower 95% CI bound (r),
# the input effect sizes for all the power analyses below
point_estimate_vahey <- .45
lower_ci_vahey <- .40
# Reproduce Vahey et al.'s required sample sizes (80% power, alpha = .05).
# Correlations are tested directly with pwr.r.test(); t-tests convert r to
# Cohen's d via effectsize::r_to_d(). pwr.t.test() returns the per-group n
# for two-sample designs, so independent-groups totals are doubled.
# This single helper + mapply() replaces eight near-identical assignment
# blocks that differed only in effect size, test type, and tails.
required_n_for <- function(test, tails, r) {
  alt <- if (tails == "One-tailed") "greater" else "two.sided"
  if (grepl("Pearson", test)) {
    ceiling(pwr.r.test(n = NULL, r = r, sig.level = 0.05,
                       power = 0.8, alternative = alt)$n)
  } else if (grepl("Independent", test)) {
    ceiling(pwr.t.test(n = NULL, d = r_to_d(r), type = "two.sample",
                       sig.level = 0.05, power = 0.8, alternative = alt)$n) * 2
  } else {
    ceiling(pwr.t.test(n = NULL, d = r_to_d(r), type = "paired",
                       sig.level = 0.05, power = 0.8, alternative = alt)$n)
  }
}
# Fill in the verified Ns row by row; each row uses either the point estimate
# or the lower CI bound depending on its estimate label.
data_power_verified$required_n_verified <-
  mapply(function(test, tails, estimate) {
           r <- if (estimate == "Point estimate") point_estimate_vahey else lower_ci_vahey
           required_n_for(test, tails, r)
         },
         data_power_verified$test,
         data_power_verified$tails,
         data_power_verified$estimate,
         USE.NAMES = FALSE)
# print the verification table
data_power_verified %>%
kable() %>%
kable_classic(full_width = FALSE)

| test | tails | estimate | effect_size_vahey | required_n_vahey | required_n_verified |
|---|---|---|---|---|---|
| Pearson’s r | One-tailed | Point estimate | 0.45 | 29 | 29 |
| Pearson’s r | One-tailed | Lower bound of 95% Confidence Interval | 0.40 | 37 | 37 |
| Pearson’s r | Two-tailed | Point estimate | 0.45 | 36 | 36 |
| Pearson’s r | Two-tailed | Lower bound of 95% Confidence Interval | 0.40 | NA | 46 |
| Independent t-test (Cohen’s d) | One-tailed | Point estimate | 1.01 | 26 | 26 |
| Independent t-test (Cohen’s d) | One-tailed | Lower bound of 95% Confidence Interval | 0.87 | 36 | 34 |
| Dependent t-test (Cohen’s d) | One-tailed | Point estimate | 1.01 | 8 | 8 |
| Dependent t-test (Cohen’s d) | One-tailed | Lower bound of 95% Confidence Interval | 0.87 | 10 | 10 |
Calculated from weighted average effect sizes reported in forest plot.
The power analyses relied on the accuracy of the meta-analytic effect size. So, I then attempted to computationally reproduce the meta-analytic effect size from the effect sizes, confidence intervals, credibility intervals, and sample sizes reported in Vahey et al.’s forest plot and their description of their meta-analytic method in their manuscript.
In an email, Vahey stated that they performed a Hunter & Schmidt meta analysis, following the guide of Field & Gillett (2010) and using their code.
On inspection, Field & Gillett (2010) make a distinction between the Hunter & Schmidt method, which is distinctive for reporting credibility intervals rather than confidence intervals, and the Hedges and colleagues’ method, which is distinctive for using Fisher’s r-to-z transformations.
Vahey et al. (2015) do not report applying any transformations to their data. However, Vahey et al.’s (2015) individual effect sizes in their forest plot have asymmetric confidence intervals. This implies that a transformation was performed. It is therefore possible that Vahey et al. combined the two methods (and code) provided by Field & Gillett (2010). I first fit a Hunter & Schmidt method, then a combined Hunter & Schmidt and Hedges’ and colleagues method.
Information about implementation of Hunter & Schmidt-style meta-analysis in metafor from here.
# Total N across the articles in Vahey et al.'s forest plot: one N per
# article (distinct article/N pairs), missing Ns dropped before summing.
total_n <- data_vahey %>%
  distinct(article, n_forest_plot) %>%
  drop_na() %>%
  pull(n_forest_plot) %>%
  sum()
# calculate variances
# escalc(measure = "COR") returns raw correlations (yi) and their sampling
# variances (vi); vtype = "AV" estimates each variance using the average r,
# consistent with the Hunter & Schmidt approach
data_recalculated_variance <-
escalc(measure = "COR",
ri = mean_r_forest_plot_numeric,
ni = n_forest_plot,
data = data_vahey,
vtype = "AV",
digits = 10) %>%
drop_na() %>%
select(article, article_abbreviated,
yi, vi, ni = n_forest_plot) %>%
# Wald-type 95% CI bounds for each study, used later for plotting
mutate(lower = yi - sqrt(vi)*1.96,
upper = yi + sqrt(vi)*1.96)
# fit model
# random-effects meta-analysis of the raw correlations using the
# Hunter & Schmidt estimator, weighted by sample size as H&S prescribe
fit_reproduced <-
rma(yi = yi,
vi = vi,
weights = ni, # Hunter Schmidt method requires weighting by N
method = "HS", # Hunter Schmidt method
data = data_recalculated_variance,
slab = article_abbreviated,
digits = 10)
# pooled estimate with confidence and prediction intervals from predict();
# the credibility interval is added manually below
predictions_reproduced <-
predict(fit_reproduced) %>%
as.data.frame() %>%
select(-se) %>%
# cr following Field & Gillett (2010) equation 5, where s_r^2 == tau2 as they both estimate the variance in population correlations. e.g., see Field's script "h_s syntax.sps" which refers to this as both tau and s_r^2 depending on the meta analytic approach
mutate(cr.lb = pred - 1.96*sqrt(fit_reproduced$tau2),
cr.ub = pred + 1.96*sqrt(fit_reproduced$tau2))
# rounded copy used for inline printing
predictions_reproduced_printing <- predictions_reproduced %>%
round_df(2)
# predictions_reproduced %>%
# round_df(2) %>%
# kable() %>%
# kable_classic(full_width = FALSE)
# plot
# assemble the study rows plus two summary rows (meta estimate with its CI
# and with its CR) in the layout forestploter expects
data_forest <- data_recalculated_variance %>%
drop_na() %>%
select(article_abbreviated,
n = ni,
r = yi,
lower,
upper) %>%
# bind_rows(tibble(article_abbreviated = "Meta",
# n = 35, # arbitrary number to make the diamond an appropriate size
# r = predictions_reproduced$pred,
# lower = predictions_reproduced$pi.lb,
# upper = predictions_reproduced$pi.ub)) %>%
bind_rows(tibble(article_abbreviated = c("Meta (confidence interval)",
"Meta (credibility interval)"),
n = 35, # arbitrary number to make the diamond an appropriate size
r = c(predictions_reproduced$pred,
predictions_reproduced$pred),
lower = c(predictions_reproduced$ci.lb,
predictions_reproduced$cr.lb),
upper = c(predictions_reproduced$ci.ub,
predictions_reproduced$cr.ub))) %>%
# size = each row's share of total N (drives marker size); summary rows then
# display the total N in the table column
mutate(size = n/sum(n),
n = ifelse(str_detect(article_abbreviated, "Meta"), total_n, n),
article_abbreviated = fct_relevel(article_abbreviated,
"Meta (confidence interval)",
"Meta (credibility interval)")) %>%
# blank padding column reserves horizontal space for the CI graphic
mutate(` ` = paste(rep(" ", 30), collapse = " ")) %>%
select(Article = article_abbreviated, ` `, r, lower, upper, n, size) %>%
round_df(2)
# forest plot of the reproduced Hunter & Schmidt meta-analysis
p_reproduced <-
forestploter::forest(data = select(data_forest, -size),
est = data_forest$r,
lower = data_forest$lower,
upper = data_forest$upper,
sizes = 1/data_forest$size, # NOTE(review): inverse of the N share — confirm intended marker scaling
#is_summary = c(rep(FALSE, nrow(data_forest)-1), TRUE),
is_summary = c(rep(FALSE, nrow(data_forest)-2), TRUE, TRUE), # last two rows are the meta CI and CR diamonds
ci_column = 2,
ref_line = 0,
theme = custom_theme,
xlim = c(-0.3, 1.1),
ticks_at = c(-.2, 0, .2, .4, .6, .8, 1.0))
p_reproduced

ggplot2::ggsave(filename = "plots/forest plot Hunter & Schmidt method.pdf",
plot = p_reproduced,
device = "pdf",
width = 7.5,
height = 4.5,
units = "in")

Meta effect: k = 15, r = 0.47, 95% CI [0.4, 0.54], 95% CR [0.47, 0.47], 95% PI [0.4, 0.54], p = 0.000000000000000000000000000000000000004.
Fisher’s r-to-z transformation and back transformation, with HS estimator. No Overton (1998) transformation.
# calculate variances
# same pipeline as above but with measure = "ZCOR": yi are Fisher's
# r-to-z transformed correlations with variance 1/(n - 3)
data_recalculated_variance_transformed <- data_vahey %>%
# # Overton transformation. Unnumbered equation between equations 8 and 9 in Field & Gillett (2010)
# mutate(mean_r_forest_plot_numeric = mean_r_forest_plot_numeric - (mean_r_forest_plot_numeric*(1-mean_r_forest_plot_numeric^2))/(2*(n_forest_plot-3))) %>%
escalc(measure = "ZCOR",
ri = mean_r_forest_plot_numeric,
ni = n_forest_plot,
data = .,
vtype = "AV",
digits = 10) %>%
drop_na() %>%
select(article, article_abbreviated,
yi, vi, ni = n_forest_plot) %>%
# Wald-type 95% CI bounds on the z scale (back-transformed to r before plotting)
mutate(lower = yi - sqrt(vi)*1.96,
upper = yi + sqrt(vi)*1.96)
# fit model
# Hunter & Schmidt estimator applied to the Fisher's z values, N-weighted
# (the combined H&S + Hedges-style approach discussed in the text above)
fit_reproduced_transformed <-
rma(yi = yi,
vi = vi,
weights = ni, # Hunter Schmidt method requires weighting by N
method = "HS", # Hunter Schmidt method
data = data_recalculated_variance_transformed,
slab = article_abbreviated,
digits = 10)
# pooled estimate and intervals on the Fisher's z scale; credibility
# interval added manually, back-transformation happens in the next step
predictions_reproduced_transformed <-
predict(fit_reproduced_transformed) %>%
as.data.frame() %>%
select(-se) %>%
# cr following Field & Gillett (2010) equation 5, where s_r^2 == tau2 as they both estimate the variance in population correlations. e.g., see Field's script "h_s syntax.sps" which refers to this as both tau and s_r^2 depending on the meta analytic approach
mutate(cr.lb = pred - 1.96*sqrt(fit_reproduced_transformed$tau2),
cr.ub = pred + 1.96*sqrt(fit_reproduced_transformed$tau2))
# back-transform every column (estimate and all interval bounds) from
# Fisher's z to r for reporting; across(everything()) replaces the
# superseded mutate_all()
predictions_reproduced_transformed_backtransformed <-
predictions_reproduced_transformed %>%
  mutate(across(everything(), transf.ztor)) %>%
  round_df(2)
# plot
# assemble study rows plus two summary rows (meta CI and CR), all computed
# on the z scale and back-transformed to r for display
data_forest_transformed <- data_recalculated_variance_transformed %>%
select(article_abbreviated,
n = ni,
r = yi,
lower,
upper) %>%
# bind_rows(tibble(article_abbreviated = "Meta",
# n = 35, # arbitrary number to make the diamond an appropriate size
# r = predictions_reproduced_transformed$pred,
# lower = predictions_reproduced_transformed$pi.lb,
# upper = predictions_reproduced_transformed$pi.ub)) %>%
bind_rows(tibble(article_abbreviated = c("Meta (confidence interval)",
"Meta (credibility interval)"),
n = 35, # arbitrary number to make the diamond an appropriate size
r = c(predictions_reproduced_transformed$pred,
predictions_reproduced_transformed$pred),
lower = c(predictions_reproduced_transformed$ci.lb,
predictions_reproduced_transformed$cr.lb),
upper = c(predictions_reproduced_transformed$ci.ub,
predictions_reproduced_transformed$cr.ub))) %>%
# back-transform z estimates and bounds to r for display
mutate(r = transf.ztor(r),
lower = transf.ztor(lower),
upper = transf.ztor(upper)) %>%
# size = each row's share of total N; summary rows display the total N
mutate(size = n/sum(n),
n = ifelse(str_detect(article_abbreviated, "Meta"), total_n, n),
article_abbreviated = fct_relevel(article_abbreviated,
"Meta (confidence interval)",
"Meta (credibility interval)")) %>%
# blank padding column reserves horizontal space for the CI graphic
mutate(` ` = paste(rep(" ", 30), collapse = " ")) %>%
select(Article = article_abbreviated, ` `, r, lower, upper, n, size) %>%
round_df(2)
# forest plot of the r-to-z transformed reproduction
p_reproduced_transformed <-
forestploter::forest(data = select(data_forest_transformed, -size),
                     est = data_forest_transformed$r,
                     lower = data_forest_transformed$lower,
                     upper = data_forest_transformed$upper,
                     sizes = 1/data_forest_transformed$size,
                     #is_summary = c(rep(FALSE, nrow(data_forest_transformed)-1), TRUE),
                     # Bug fix: the row count must come from data_forest_transformed,
                     # not data_forest (the untransformed plot's data). The two data
                     # frames happen to have the same number of rows here, but
                     # referencing the wrong one was a latent defect.
                     is_summary = c(rep(FALSE, nrow(data_forest_transformed)-2), TRUE, TRUE),
                     ci_column = 2,
                     ref_line = 0,
                     theme = custom_theme,
                     xlim = c(-0.3, 1.1),
                     ticks_at = c(-.2, 0, .2, .4, .6, .8, 1.0))
p_reproduced_transformed

ggplot2::ggsave(filename = "plots/forest plot Hunter & Schmidt method with Fisher's r-to-z transformations.pdf",
plot = p_reproduced_transformed,
device = "pdf",
width = 7.5,
height = 4.5,
units = "in")

Meta effect: k = 15, r = 0.47, 95% CI [0.4, 0.54], 95% CR [0.47, 0.47], 95% PI [0.4, 0.54], p = 0.000000000000000000000000002.
Vahey et al. (2015) noted that “The funnel plot … indicated that this meta-effect was not subject to publication bias” (Vahey et al., 2015, p. 59). However, the Cochrane handbook (since at least version 5, which was published in 2011, prior to the preparation of Vahey et al.’s manuscript) notes in section 10.4.3.1, “Recommendations on testing for funnel plot asymmetry”, that “review authors should remember that, because the tests typically have relatively low power, even when a test does not provide evidence of funnel plot asymmetry, bias (including publication bias) cannot be excluded.” (https://handbook-5-1.cochrane.org/chapter_10/10_4_3_1_recommendations_on_testing_for_funnel_plot_asymmetry.htm). That is, as with p values, absence of evidence (of bias) does not represent evidence of absence (of bias). As such, and especially in light of Vahey et al. including only 15 averaged effect sizes, it would have been more appropriate to conclude that there was no evidence of publication bias rather than there was evidence of no publication bias.
The meta analysis doesn’t mention r-to-z transformations and back transformations, and the SPSS script that Vahey stated he used didn’t employ them, but Vahey et al.’s funnel plot is stated to include “A funnel plot of Fisher Z-transformed (standardised) versions of the 15 independent criterion r effects included in the meta-analysis (Zr) versus their corresponding standard errors”.
# recreate Vahey et al.'s funnel plot: Fisher's z effects vs standard errors
metafor::funnel(fit_reproduced_transformed,
at = c(0, .2, .4, .6, .8, 1),
ylim = c(0.35, 0),
xlab = expression(italic(Z)[r]),
#ylab = expression(Standard ~ Error ~ of ~ italic(Z)[r]),
steps = 8,
#main = "Standard Error",
back = "white",
# the 10th point is drawn filled (pch 19) to highlight the discrepant
# Nicholson, McCourt & Barnes-Holmes (2013) effect discussed below
pch = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 19, 1, 1, 1, 1, 1))
#col = c("darkgrey", "darkgrey", "darkgrey", "darkgrey", "darkgrey", "darkgrey", "darkgrey", "darkgrey", "darkgrey", "black", "darkgrey", "darkgrey", "darkgrey", "darkgrey", "darkgrey"))
points(x = .543, y = .158, pch = 8, col = "red")

The location of one data point is different in the recreation of the funnel plot from the values listed in their forest plot (transformed using Fisher’s r-to-z transformation, based on the description of the plot in Vahey et al. 2015), i.e., the effect for Nicholson, McCourt & Barnes-Holmes (2013). This effect is highlighted in the plot in black. In Vahey et al. (2015) this effect appeared approximately in the location marked with the red asterisk.
Vahey et al. (2015) stated: “We confirmed this [absence of publication bias] by conducting a Vevea and Wood’s (2005) sensitivity analysis which systematically examined the impact that the following range of publication bias scenarios would have had on our random effects meta-analysis: ‘moderate onetailed selection’, ‘moderate two-tailed selection’, ‘severe onetailed selection’ and ‘severe two-tailed selection’. Given that the meta-effect obtained with the current dataset varied by very little regardless of which form of publication bias we modelled with the Vevea and Woods method, it suggests that the current metaanalysis was not subject to publication bias (i.e. the various scenarios all reduced the size of the meta-effect r by the very small respective amounts of .007, .007, .016, .016).” (Vahey et al., 2015, p. 62)
# attempt the Vevea & Woods (2005) sensitivity analysis on the N-weighted
# model; wrapped in tryCatch because selmodel() rejects models fitted with
# custom weights (see the note and refit below)
tryCatch(
{
# code taken from metafor::selmodel() to implement vevea & woods (2005) selection models as described in Vahey et al. (2015)
# columns give the assumed publication-selection weights at each p-value
# step for the four scenarios (moderate/severe x one-/two-tailed)
tab <- data.frame(
steps = c(0.005, 0.01, 0.05, 0.10, 0.25, 0.35, 0.50, 0.65, 0.75, 0.90, 0.95, 0.99, 0.995, 1),
delta.mod.1 = c(1, 0.99, 0.95, 0.80, 0.75, 0.65, 0.60, 0.55, 0.50, 0.50, 0.50, 0.50, 0.50, 0.50),
delta.mod.2 = c(1, 0.99, 0.95, 0.90, 0.80, 0.75, 0.60, 0.60, 0.75, 0.80, 0.90, 0.95, 0.99, 1.00),
delta.sev.1 = c(1, 0.99, 0.90, 0.75, 0.60, 0.50, 0.40, 0.35, 0.30, 0.25, 0.10, 0.10, 0.10, 0.10),
delta.sev.2 = c(1, 0.99, 0.90, 0.75, 0.60, 0.50, 0.25, 0.25, 0.50, 0.60, 0.75, 0.90, 0.99, 1.00))
# one step-function selection model per scenario (one per column of tab)
sel <- lapply(
tab[-1], function(delta) {
selmodel(fit_reproduced_transformed,
type = "stepfun",
steps = tab$steps,
delta = delta)
}
)
# unadjusted estimate followed by the four adjusted estimates, back-transformed to r
res <- transf.ztor(c(fit_reproduced_transformed$beta, sapply(sel, function(x) x$beta)))
# change in meta r under each scenario relative to the unadjusted model
deltas = res - res[1]
tibble(model = c("moderate onetailed selection", "moderate two-tailed selection", "severe onetailed selection", "severe two-tailed selection"),
delta_es_vahey = c(.007, .007, .016, .016),
abs_delta_es_recalculated = abs(janitor::round_half_up(deltas[-1], 3))) |>
mutate(reproduced = delta_es_vahey==abs_delta_es_recalculated) |>
kable() |>
kable_classic(full_width = FALSE)
},
error = function(cond) {
message("Throws error. Here's the original error message:")
message(cond)
}
)

Vahey et al. stated that their meta-analytic model weighted by N. However, selection models, at least when implemented using metafor, do not allow for use of models with custom weights (i.e., other than inverse variance).
I therefore refitted the meta analysis model without the custom weights (i.e., weighted by inverse variance instead).
# refit the transformed model WITHOUT the custom N weights (inverse-variance
# weighting instead) so that selmodel() can be applied
fit_temp <-
rma(yi = yi,
vi = vi,
#weights = ni, # Hunter Schmidt method requires weighting by N
method = "HS", # Hunter Schmidt method
data = data_recalculated_variance_transformed,
slab = article_abbreviated,
digits = 10)
# code taken from metafor::selmodel() to implement vevea & woods (2005) selection models as described in Vahey et al. (2015)
# columns give the assumed publication-selection weights at each p-value
# step for the four scenarios (moderate/severe x one-/two-tailed)
tab <- data.frame(
steps = c(0.005, 0.01, 0.05, 0.10, 0.25, 0.35, 0.50, 0.65, 0.75, 0.90, 0.95, 0.99, 0.995, 1),
delta.mod.1 = c(1, 0.99, 0.95, 0.80, 0.75, 0.65, 0.60, 0.55, 0.50, 0.50, 0.50, 0.50, 0.50, 0.50),
delta.mod.2 = c(1, 0.99, 0.95, 0.90, 0.80, 0.75, 0.60, 0.60, 0.75, 0.80, 0.90, 0.95, 0.99, 1.00),
delta.sev.1 = c(1, 0.99, 0.90, 0.75, 0.60, 0.50, 0.40, 0.35, 0.30, 0.25, 0.10, 0.10, 0.10, 0.10),
delta.sev.2 = c(1, 0.99, 0.90, 0.75, 0.60, 0.50, 0.25, 0.25, 0.50, 0.60, 0.75, 0.90, 0.99, 1.00))
# one step-function selection model per scenario, applied to the refitted model
sel <- lapply(
tab[-1], function(delta) {
selmodel(fit_temp,
type = "stepfun",
steps = tab$steps,
delta = delta)
}
)
# unadjusted estimate followed by the four adjusted estimates, back-transformed to r
res <- transf.ztor(c(fit_temp$beta, sapply(sel, function(x) x$beta)))
# change in meta r under each scenario relative to the unadjusted model
deltas = res - res[1]
# compare Vahey et al.'s reported changes to the recalculated ones; the
# comparison assumes their reported values are absolute differences
tibble(model = c("moderate onetailed selection", "moderate two-tailed selection", "severe onetailed selection", "severe two-tailed selection"),
delta_es_vahey = c(.007, .007, .016, .016),
abs_delta_es_recalculated = abs(janitor::round_half_up(deltas[-1], 3)),
es_recalculated = janitor::round_half_up(res[-1], 3)) |>
mutate(reproduced = delta_es_vahey==abs_delta_es_recalculated) |>
kable() |>
kable_classic(full_width = FALSE)

| model | delta_es_vahey | abs_delta_es_recalculated | es_recalculated | reproduced |
|---|---|---|---|---|
| moderate onetailed selection | 0.007 | 0.010 | 0.461 | FALSE |
| moderate two-tailed selection | 0.007 | 0.007 | 0.464 | TRUE |
| severe onetailed selection | 0.016 | 0.016 | 0.455 | TRUE |
| severe two-tailed selection | 0.016 | 0.017 | 0.455 | FALSE |
Meta r in refitted model = 0.47.
Numeric values of the differences in effect size reproduced in 2 of 4 cases, on the assumption that the differences are absolute differences. Magnitude of the differences are in line with Vahey et al.’s.
However, the use of a selection model such as this cannot “confirm” the absence of publication bias. It merely presents a corrected meta effect size in light of very specific and untestable assumptions.
Carter et al. (2019) “Correcting for Bias in Psychology: A Comparison of Meta-Analytic Methods” compared the Vevea & Woods (2005) selection model, which they refer to as 3PSM (three parameter selection model) with other bias detection and correction methods in a large scale simulation study. They concluded that no method is a silver bullet, and all perform poorly under some conditions, and that these conditions are generally untestable in real data. They recommend that multiple methods should be reported, and that ultimately only improvements to the primary literature will increase the quality of our evidence. Without attempting to examine this exhaustively, I illustrate how choice of bias correction method can influence results by calculating one other method considered by Carter et al. (2019): the PET correction. In terms of implementation, the PET correction simply includes the standard error as moderator within the meta-analytic model.
# fit model
# PET (precision-effect test): add the standard error as a moderator; the
# model intercept then estimates the effect at SE = 0, a bias-corrected estimate
fit_reproduced_transformed_pet <-
rma(yi = yi,
vi = vi,
mods = sqrt(vi), # PET model adds moderation by SE
weights = ni, # Hunter Schmidt method requires weighting by N
method = "HS", # Hunter Schmidt method
data = data_recalculated_variance_transformed,
slab = article_abbreviated,
digits = 10)
# predictions from the PET meta-regression; the bias-corrected estimate
# itself is the model intercept (extracted in the summary table below)
predictions_reproduced_transformed_pet <-
predict(fit_reproduced_transformed_pet) %>%
as.data.frame() %>%
select(-se) %>%
# cr following Field & Gillett (2010) equation 5, where s_r^2 == tau2 as they both estimate the variance in population correlations. e.g., see Field's script "h_s syntax.sps" which refers to this as both tau and s_r^2 depending on the meta analytic approach
mutate(cr.lb = pred - 1.96*sqrt(fit_reproduced_transformed_pet$tau2),
cr.ub = pred + 1.96*sqrt(fit_reproduced_transformed_pet$tau2))
# Summarize the PET-corrected meta effect (model intercept), back-transformed
# from Fisher's z to r, and its difference from the uncorrected estimate.
tibble(meta_r = transf.ztor(fit_reproduced_transformed_pet$beta[1]),
       ci_lower = transf.ztor(fit_reproduced_transformed_pet$ci.lb[1]),
       ci_upper = transf.ztor(fit_reproduced_transformed_pet$ci.ub[1]),
       p = fit_reproduced_transformed_pet$pval[1]) |>
  # Bug fix: the uncorrected model's beta is on the Fisher's-z scale; it must
  # be back-transformed to r before differencing with meta_r. The original
  # subtracted a z value from an r value, overstating the difference.
  mutate(delta_base_vs_pet = meta_r - transf.ztor(fit_reproduced_transformed$beta[1])) |>
  round_df(2) |>
  kable() |>
kable_classic(full_width = FALSE)

| meta_r | ci_lower | ci_upper | p | delta_base_vs_pet |
|---|---|---|---|---|
| 0.36 | -0.04 | 0.66 | 0.08 | -0.16 |
Reported in forest plot, calculated from individual effect sizes in supplementary materials.
Next, I attempted to reproduce the weighted-mean effect sizes reported in Vahey et al.’s forest plot from the individual effect sizes they reported in their supplementary materials.
Vahey et al. reported that they weighted by degrees of freedom, and I do the same.
I noted that some of the confidence intervals in Vahey et al.’s forest plot are asymmetrical around the point estimate. This is uncommon and not accounted for by Vahey et al. detailing of how they calculated the effect sizes and their confidence intervals. However, I take them at face value as they are the most detailed data available to work from.
# recalculate and compare
# recompute each article's df-weighted mean r from the supplementary
# materials and compare against the value shown in the forest plot
data_weighted_effect_sizes <- data_vahey %>%
select(article = article_abbreviated,
r_supplementary,
df_supplementary) %>%
group_by(article) %>%
summarize(r_recalculated_weighted_mean = round_half_up(weighted.mean(x = r_supplementary, w = df_supplementary), 2)) %>%
ungroup() %>%
left_join(data_vahey %>%
select(article = article_abbreviated,
mean_r_forest_plot_numeric) %>%
drop_na(),
by = "article") %>%
# congruent = recalculated and reported values match at 2 decimal places
mutate(congruence = ifelse(janitor::round_half_up(r_recalculated_weighted_mean, 2) == janitor::round_half_up(mean_r_forest_plot_numeric, 2), "Congruent", "Incongruent"),
congruence = fct_relevel(congruence, "Congruent", "Incongruent"),
difference = r_recalculated_weighted_mean - mean_r_forest_plot_numeric)
# plot
# scatter of reported vs recalculated weighted means; points off the
# dotted identity line are effects that could not be reproduced
p_comparison_reaveraged <-
ggplot(data_weighted_effect_sizes, aes(mean_r_forest_plot_numeric, r_recalculated_weighted_mean)) +
geom_abline(slope = 1, linetype = "dotted") +
geom_point(aes(color = congruence, shape = congruence), size = 2.25) +
theme_classic() +
scale_color_viridis_d(begin = 0.25, end = 0.75) +
theme(legend.position = "top", # c(0.25, 0.85),
legend.title = element_blank()) +
#legend.box.background = element_rect(colour = "black")) +
xlim(0.35, 0.75) +
ylim(0.35, 0.75) +
xlab("Reported in Vahey et al.'s (2015)\nforest plot") +
ylab("Recalculated from Vahey et al.'s (2015)\nsupplementary materials") +
annotate("text", x = .66, y = .45, size = 3, label = "Overestimates validity") +
annotate("text", x = .45, y = .66, size = 3, label = "Underestimates validity")
p_comparison_reaveraged

ggplot2::ggsave(filename = "plots/scatter plot comparing effect sizes in forest plot to reaveraged from supplementary materials.pdf",
plot = p_comparison_reaveraged,
device = "pdf",
width = 4.5,
height = 4.5,
units = "in")
# table
# count and percentage of articles whose forest-plot value differs from
# the recalculated weighted mean
data_weighted_effect_sizes %>%
summarize(n_incongruent = sum(ifelse(congruence == "Incongruent", TRUE, FALSE)),
percent_incongruent = round_half_up(mean(ifelse(congruence == "Incongruent", TRUE, FALSE)), 2)*100) %>%
kable() %>%
kable_classic(full_width = FALSE)

| n_incongruent | percent_incongruent |
|---|---|
| 2 | 13 |
# list the specific articles whose weighted means did not reproduce
data_weighted_effect_sizes %>%
filter(difference != 0) %>%
select(article, r_recalculated_weighted_mean, mean_r_forest_plot_numeric, difference, congruence) %>%
round_df(2) %>%
kable() %>%
kable_classic(full_width = FALSE)

| article | r_recalculated_weighted_mean | mean_r_forest_plot_numeric | difference | congruence |
|---|---|---|---|---|
| Nicholson, Dempsey et al. (2013) | 0.46 | 0.48 | -0.02 | Incongruent |
| Vahey et al. (2009) | 0.58 | 0.53 | 0.05 | Incongruent |
The weighted effect sizes reported in Vahey et al.’s forest plot could not be computationally reproduced from the individual effect sizes and degrees of freedom reported in their supplementary materials.
Many values were congruent, but a minority differed (k = 2 [13%]). Table includes those that differed.
Lakens (2016) describes incorrect inclusions as inclusion of effect sizes that do not meet the inclusion criteria. Vahey et al. (2015) stated that the purpose of their meta-analysis was to “quantify how much IRAP effects from clinically-relevant responding co-vary with corresponding clinically-relevant criterion variables” (p.60). Their inclusion criterion was that “the IRAP and criterion variables must have been deemed to target some aspect of a condition included in a major psychiatric diagnostic scheme such as the Diagnostic and Statistical Manual of Mental Disorders (DSM-5, 2013) … The authors decided whether the responses measured by a given IRAP trial-type should co-vary with a specific criterion variable by consulting the relevant empirical literature.” (p.60). References to neither the specific clinical condition that was targeted by the IRAP and the criterion variable nor the specific empirical literature that Vahey et al. (2015) used to justify the inclusion of each criterion were provided in the original article or supplementary materials. Nonetheless, Vahey’s own inclusion criterion required that effects referred to covariation between an IRAP and an external clinically-relevant criterion variable, consistent with the APA dictionary of psychology definition of criterion validity (REF).
# count Vahey et al.'s effects by whether they involve an external criterion variable
data_vahey %>%
count(involves_external_criterion) %>%
kable() %>%
kable_classic(full_width = FALSE)

| involves_external_criterion | n |
|---|---|
| FALSE | 23 |
| TRUE | 33 |
“Collectively, these 15 articles yielded 56 statistical effects between various clinically focused IRAP effects and their respective criterion variables. … of the 56 statistical effects only 8 (i.e. 14%) were not initially cited by both authors for inclusion.” (Vahey et al., 2015, p. 60)
Vahey et al. did not report how many effect sizes they excluded.
I reviewed these same 15 articles and followed Vahey et al.’s (2015) same inclusion criteria (i.e., like Vahey, I did not limit myself to effect sizes reported in the articles but also considered ones implied within the articles. Like Vahey et al., where necessary I contacted the original authors to obtain additional information about effect sizes).
I extracted 308 effect sizes. Of these, 255 met the inclusion criterion of being a criterion effect. Of these, 171 met the inclusion criterion of being clinically relevant by Vahey et al.’s definition. This was 3.1 times the number extracted by Vahey et al.
# Inter-rater agreement on the "clinically relevant" rating.
data_raters <- data_reextracted_criterion %>%
select(clinically_relevant_criterion_rater_1, clinically_relevant_criterion_rater_2)
# Proportion of effects on which the two raters agreed. The redundant
# ifelse(x == y, TRUE, FALSE) wrapper is removed; direct comparison is
# equivalent (including NA propagation).
# NOTE(review): the column is named "percent" but holds a proportion
# (e.g., 0.9, not 90) — confirm the intended reporting scale.
data_raters %>%
mutate(agreement = clinically_relevant_criterion_rater_1 == clinically_relevant_criterion_rater_2) %>%
summarize(interrater_percent_agreement = round_half_up(mean(agreement, na.rm = TRUE), 1)) %>%
kable() %>%
kable_classic(full_width = FALSE)

| interrater_percent_agreement |
|---|
| 0.9 |
kappa2(data_raters, "unweighted")

## Cohen's Kappa for 2 Raters (Weights: unweighted)
##
## Subjects = 255
## Raters = 2
## Kappa = 0.872
##
## z = 14
## p-value = 0
Excluding non-clinically relevant and effects that do not involve a relationship with an external variable, which cannot provide evidence of criterion validity.
Details of the meta-analytic strategy, which used contemporary methods:
# total N for the new meta-analysis: one N per article (the largest
# reported, since Ns can differ across effects within an article)
total_n_new <- data_reextracted_criterion_clinically_relevant %>%
distinct(article, n_from_paper) %>%
group_by(article) %>%
filter(n_from_paper == max(n_from_paper)) %>%
ungroup() %>%
summarize(total_n = sum(n_from_paper, na.rm = TRUE)) %>%
pull(total_n)
# exclusions and transformations
# prepare the re-extracted effects for meta-analysis: Fisher's z transform
# (ZCOR), one label per effect, and per-effect Wald CI bounds
data_for_meta_new <- data_reextracted_criterion_clinically_relevant %>%
mutate(variables = paste(irap_variable, criterion_variable, sep = "-")) %>%
dplyr::select(article, variables, r_from_paper, n_from_paper, authors_include_bh) %>%
na.omit() %>%
escalc(measure = "ZCOR",
ri = r_from_paper,
ni = n_from_paper,
data = .,
slab = paste(article, variables),
digits = 10) %>%
rename(ni = n_from_paper) %>%
dplyr::select(article, variables, yi, vi, ni, authors_include_bh) %>%
na.omit() %>%
# number the effects within each article to build unique display labels
group_by(article) %>%
mutate(es_number = row_number(),
article_abbreviated = paste(article, es_number)) %>%
ungroup() %>%
# Wald-type 95% CI bounds on the z scale (back-transformed before plotting)
mutate(lower = yi - sqrt(vi)*1.96,
upper = yi + sqrt(vi)*1.96)
# fit
# multilevel random-effects model with effects nested within articles
# (random intercept per article), REML estimation on the Fisher's z values
fit_new <-
rma.mv(yi = yi,
V = vi,
random = ~ 1 | article,
method = "REML",
data = data_for_meta_new,
slab = article_abbreviated,
digits = 10)
# pooled estimate with confidence and prediction intervals; credibility
# interval added manually, then everything back-transformed to r
predictions_new <-
predict(fit_new) %>%
  as.data.frame() %>%
  select(-se) %>%
  # cr following Field & Gillett (2010) equation 5.
  # Bug fix: rma.mv() stores its variance components in $sigma2, not $tau2;
  # $tau2 is zero for this model, which collapsed the credibility interval
  # to the point estimate (CR [0.22, 0.22] in the earlier output). Summing
  # the sigma2 components gives the total estimated between-effect variance.
  mutate(cr.lb = pred - 1.96*sqrt(sum(fit_new$sigma2)),
         cr.ub = pred + 1.96*sqrt(sum(fit_new$sigma2)))
# back-transform all columns from Fisher's z to r for reporting;
# across(everything()) replaces the superseded mutate_all()
predictions_new_backtransformed <- predictions_new %>%
  mutate(across(everything(), transf.ztor)) %>%
  round_df(2)
# plot
# assemble study rows plus three summary rows (meta CI, CR, and PI),
# computed on the z scale and back-transformed to r for display
data_forest_new <- data_for_meta_new %>%
drop_na() %>%
select(article_abbreviated,
n = ni,
r = yi,
lower,
upper) %>%
# bind_rows(tibble(article_abbreviated = "Meta",
# n = 35, # arbitrary number to make the diamond an appropriate size
# r = predictions_new$pred,
# lower = predictions_new$ci.lb,
# upper = predictions_new$ci.ub)) %>%
bind_rows(tibble(article_abbreviated = c("Meta-analysis (3-level RE confidence interval)",
"Meta-analysis (3-level RE credibility interval)",
"Meta-analysis (3-level RE prediction interval)"),
n = 35, # arbitrary number to make the diamond an appropriate size
r = c(predictions_new$pred,
predictions_new$pred,
predictions_new$pred),
lower = c(predictions_new$ci.lb,
predictions_new$cr.lb,
predictions_new$pi.lb),
upper = c(predictions_new$ci.ub,
predictions_new$cr.ub,
predictions_new$pi.ub))) %>%
# back-transform z estimates and bounds to r for display
mutate(r = transf.ztor(r),
lower = transf.ztor(lower),
upper = transf.ztor(upper)) %>%
# size = each row's share of total N; summary rows display the total N
mutate(size = n/sum(n),
n = ifelse(str_detect(article_abbreviated, "Meta"), total_n_new, n),
article_abbreviated = fct_relevel(article_abbreviated,
"Meta-analysis (3-level RE confidence interval)",
"Meta-analysis (3-level RE credibility interval)",
"Meta-analysis (3-level RE prediction interval)")) %>%
# blank padding column reserves horizontal space for the CI graphic
mutate(` ` = paste(rep(" ", 35), collapse = " ")) %>%
select(Article = article_abbreviated, ` `, r, lower, upper, n, size) %>%
mutate(r = round_half_up(r, 2),
lower = round_half_up(lower, 2),
upper = round_half_up(upper, 2))
# forest plot of the new multilevel meta-analysis
p_new <-
forestploter::forest(data = select(data_forest_new, -size),
est = data_forest_new$r,
lower = data_forest_new$lower,
upper = data_forest_new$upper,
sizes = 1/data_forest_new$size,
# is_summary = c(rep(FALSE, nrow(data_forest_new)-1), TRUE),
# final three rows are the meta CI, CR, and PI diamonds
is_summary = c(rep(FALSE, nrow(data_forest_new)-3),
rep(TRUE, 3)),
ci_column = 2,
ref_line = 0,
theme = custom_theme,
xlim = c(-0.7, 1.1),
ticks_at = c(-.6, -.4, -.2, 0, .2, .4, .6, .8, 1.0))
p_new

ggplot2::ggsave(filename = "plots/forest plot new multilevel model.pdf",
plot = p_new,
device = "pdf",
width = 8,
height = 39,
units = "in")

Meta effect: k = 171, r = 0.22, 95% CI [0.15, 0.29], 95% CR [0.22, 0.22], 95% PI [-0.01, 0.42], p = 0.000000004.
# extend the power table with the new meta-analytic effect sizes:
# r for the correlation rows, converted to Cohen's d for the t-test rows
# (row order matches the test/tails/estimate layout built above)
data_power_new <- data_power_verified %>%
mutate(effect_size_new = c(predictions_new_backtransformed$pred,
predictions_new_backtransformed$ci.lb,
predictions_new_backtransformed$pred,
predictions_new_backtransformed$ci.lb,
r_to_d(predictions_new_backtransformed$pred),
r_to_d(predictions_new_backtransformed$ci.lb),
r_to_d(predictions_new_backtransformed$pred),
r_to_d(predictions_new_backtransformed$ci.lb)),
effect_size_new = round_half_up(effect_size_new, 2))
# new point estimate and lower CI bound (r), inputs for the power analyses
point_estimate_new <- predictions_new_backtransformed$pred
lower_ci_new <- predictions_new_backtransformed$ci.lb
# Required Ns implied by the new meta-analytic estimate (80% power,
# alpha = .05), mirroring the verification of Vahey et al.'s power analyses
# above. Correlations use pwr.r.test(); t-tests convert r to Cohen's d via
# effectsize::r_to_d(), doubling the per-group n for independent groups.
# One helper + mapply() replaces eight near-identical assignment blocks.
required_n_new_for <- function(test, tails, r) {
  alt <- if (tails == "One-tailed") "greater" else "two.sided"
  if (grepl("Pearson", test)) {
    ceiling(pwr.r.test(n = NULL, r = r, sig.level = 0.05,
                       power = 0.8, alternative = alt)$n)
  } else if (grepl("Independent", test)) {
    ceiling(pwr.t.test(n = NULL, d = r_to_d(r), type = "two.sample",
                       sig.level = 0.05, power = 0.8, alternative = alt)$n) * 2
  } else {
    ceiling(pwr.t.test(n = NULL, d = r_to_d(r), type = "paired",
                       sig.level = 0.05, power = 0.8, alternative = alt)$n)
  }
}
# Fill in the new required Ns row by row; each row uses the new point
# estimate or the new lower CI bound depending on its estimate label.
data_power_new$required_n_new <-
  mapply(function(test, tails, estimate) {
           r <- if (estimate == "Point estimate") point_estimate_new else lower_ci_new
           required_n_new_for(test, tails, r)
         },
         data_power_new$test,
         data_power_new$tails,
         data_power_new$estimate,
         USE.NAMES = FALSE)
# Print the power-analysis table.
# NOTE: the knitted output table header had been fused onto the final code
# line (`kable_classic(full_width = FALSE)| test | ...`) — split back apart.
data_power_new %>%
  kable() %>%
  kable_classic(full_width = FALSE)
| test | tails | estimate | effect_size_vahey | required_n_vahey | required_n_verified | effect_size_new | required_n_new |
|---|---|---|---|---|---|---|---|
| Pearson’s r | One-tailed | Point estimate | 0.45 | 29 | 29 | 0.22 | 126 |
| Pearson’s r | One-tailed | Lower bound of 95% Confidence Interval | 0.40 | 37 | 37 | 0.15 | 273 |
| Pearson’s r | Two-tailed | Point estimate | 0.45 | 36 | 36 | 0.22 | 160 |
| Pearson’s r | Two-tailed | Lower bound of 95% Confidence Interval | 0.40 | NA | 46 | 0.15 | 346 |
| Independent t-test (Cohen’s d) | One-tailed | Point estimate | 1.01 | 26 | 26 | 0.45 | 124 |
| Independent t-test (Cohen’s d) | One-tailed | Lower bound of 95% Confidence Interval | 0.87 | 36 | 34 | 0.30 | 270 |
| Dependent t-test (Cohen’s d) | One-tailed | Point estimate | 1.01 | 8 | 8 | 0.45 | 32 |
| Dependent t-test (Cohen’s d) | One-tailed | Lower bound of 95% Confidence Interval | 0.87 | 10 | 10 | 0.30 | 69 |
# compare power analysis sample sizes to the published literature
# For each power-analysis row, compute the percentage of published studies
# whose sample size exceeds that row's required N. rowwise() is needed
# because data_sample_sizes_vector is compared against a per-row scalar.
# NOTE: the knitted output table header had been fused onto the final code
# line (`kable_classic(full_width = FALSE)| test | ...`) — split back apart.
data_power_new %>%
  rowwise() %>%
  mutate(percent_of_publications_meeting_required_n_verified = janitor::round_half_up(mean(data_sample_sizes_vector > required_n_verified)*100, 1),
         percent_of_publications_meeting_required_n_new = janitor::round_half_up(mean(data_sample_sizes_vector > required_n_new)*100, 1)) %>%
  ungroup() %>%
  select(test, tails,
         required_n_verified,
         percent_of_publications_meeting_required_n_verified,
         required_n_new,
         percent_of_publications_meeting_required_n_new) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
| test | tails | required_n_verified | percent_of_publications_meeting_required_n_verified | required_n_new | percent_of_publications_meeting_required_n_new |
|---|---|---|---|---|---|
| Pearson’s r | One-tailed | 29 | 75.5 | 126 | 2.1 |
| Pearson’s r | One-tailed | 37 | 54.3 | 273 | 0.0 |
| Pearson’s r | Two-tailed | 36 | 55.9 | 160 | 1.1 |
| Pearson’s r | Two-tailed | 46 | 39.4 | 346 | 0.0 |
| Independent t-test (Cohen’s d) | One-tailed | 26 | 77.7 | 124 | 2.1 |
| Independent t-test (Cohen’s d) | One-tailed | 34 | 59.0 | 270 | 0.0 |
| Dependent t-test (Cohen’s d) | One-tailed | 8 | 99.5 | 32 | 62.2 |
| Dependent t-test (Cohen’s d) | One-tailed | 10 | 98.9 | 69 | 11.7 |
# Count distinct publications (by Title) per study design and express each
# design's share as a percentage.
# NOTE: the knitted output table header had been fused onto the final code
# line (`kable_classic(full_width = FALSE)| study_design... |`) — split apart.
data_sample_sizes |>
  distinct(Title, study_design_ignoring_trial_type_comparisons) |>
  count(study_design_ignoring_trial_type_comparisons) |>
  mutate(percent = janitor::round_half_up(n/sum(n)*100, 1)) |>
  kable() |>
  kable_classic(full_width = FALSE)
| study_design_ignoring_trial_type_comparisons | n | percent |
|---|---|---|
| between | 74 | 48.1 |
| mixed | 32 | 20.8 |
| within | 48 | 31.2 |